{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [],
   "source": [
    "from requests_html import HTMLSession\n",
    "import requests_html\n",
    "import pandas as pd\n",
    "import urllib.parse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "200"
      ]
     },
     "execution_count": 88,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# A1  nfu.edu.cn \n",
    "session = HTMLSession()\n",
    "r = session.get(\"https://www.nfu.edu.cn/gjdt/index.htm\")\n",
    "r.status_code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 存\n",
    "with open (\"html_out/_nfu_高教动态.html\", encoding = \"utf8\", mode = \"w\") as fp:\n",
    "    fp.write(r.html.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 读\n",
    "with open (\"html_out/_nfu_高教动态.html\", encoding = \"utf8\", mode = \"r\") as fp:\n",
    "    html_load = fp.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'https://www.nfu.edu.cn/gjdt/309be8b078444044b51624f0e186729e.htm'},\n",
       " {'https://www.nfu.edu.cn/gjdt/159b20971f8b4051ba7cbbc80e65b871.htm'},\n",
       " {'https://www.nfu.edu.cn/gjdt/27ba495edc1b49f88bcebb750c5dcc33.htm'},\n",
       " {'https://www.nfu.edu.cn/gjdt/20dc120c250642cca5815c93591bb5cb.htm'},\n",
       " {'https://www.nfu.edu.cn/gjdt/b43531427fb44695bbb0e16280988965.htm'},\n",
       " {'https://www.nfu.edu.cn/gjdt/1509f4f3bc2f4babbe57c7ec29854807.htm'},\n",
       " {'https://www.nfu.edu.cn/gjdt/a4b2fb3dacae4564976e9a951bcddcff.htm'},\n",
       " {'https://www.nfu.edu.cn/gjdt/0e7d664c116f4a0ab9e6d165977f9def.htm'},\n",
       " {'https://www.nfu.edu.cn/gjdt/71e152ddce12414388346126ba1a1b6b.htm'},\n",
       " {'https://www.nfu.edu.cn/gjdt/0d7bfc95f70841a6b5c2ad85f74c3510.htm'},\n",
       " {'https://www.nfu.edu.cn/gjdt/750c396e278446e687a42965b1c9a385.htm'},\n",
       " {'https://www.nfu.edu.cn/gjdt/cd3ecf8986ad40e991e37f635f1ab8b4.htm'},\n",
       " {'https://www.nfu.edu.cn/gjdt/3a7835bded2441aeb67bbc6f2a61471c.htm'},\n",
       " {'https://www.nfu.edu.cn/gjdt/4d0eb3a8b8ee47f6b9bf0e9fda26fa4a.htm'},\n",
       " {'https://www.nfu.edu.cn/gjdt/355e3d6207974a3ea62ef78d2ecc2f23.htm'},\n",
       " {'https://www.nfu.edu.cn/gjdt/9b6486d83f454a2ca7a0267169ca534d.htm'},\n",
       " {'https://www.nfu.edu.cn/gjdt/6bb7172f46b3458b8022b1132593b96b.htm'},\n",
       " {'https://www.nfu.edu.cn/gjdt/a8f6f2c6a2c644d2998d7836f8151f58.htm'},\n",
       " {'https://www.nfu.edu.cn/gjdt/f978598b61024bb2982f5fbb32f81b9d.htm'},\n",
       " {'https://www.nfu.edu.cn/gjdt/3d9e9dfac15945938692c88167347ee7.htm'}]"
      ]
     },
     "execution_count": 98,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list_URL=[i.absolute_links for i in r.html.xpath('//div[@class=\"news_title\"]/a')]\n",
    "list_URL\n",
    "# 绝对路径，可直接得出链接，不需要再去拼接"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ParseResult(scheme='https', netloc='www.nfu.edu.cn', path='/gjdt/index.htm', params='', query='', fragment='')"
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# # 解析\n",
    "# base_url = r.url\n",
    "# nfu_urlparse = urllib.parse.urlparse(base_url)\n",
    "# nfu_urlparse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.nfu.edu.cn/gjdt/309be8b078444044b51624f0e186729e.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/159b20971f8b4051ba7cbbc80e65b871.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/27ba495edc1b49f88bcebb750c5dcc33.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/20dc120c250642cca5815c93591bb5cb.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/b43531427fb44695bbb0e16280988965.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/1509f4f3bc2f4babbe57c7ec29854807.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/a4b2fb3dacae4564976e9a951bcddcff.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/0e7d664c116f4a0ab9e6d165977f9def.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/71e152ddce12414388346126ba1a1b6b.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/0d7bfc95f70841a6b5c2ad85f74c3510.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/750c396e278446e687a42965b1c9a385.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/cd3ecf8986ad40e991e37f635f1ab8b4.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/3a7835bded2441aeb67bbc6f2a61471c.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/4d0eb3a8b8ee47f6b9bf0e9fda26fa4a.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/355e3d6207974a3ea62ef78d2ecc2f23.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/9b6486d83f454a2ca7a0267169ca534d.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/6bb7172f46b3458b8022b1132593b96b.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/a8f6f2c6a2c644d2998d7836f8151f58.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/f978598b61024bb2982f5fbb32f81b9d.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/3d9e9dfac15945938692c88167347ee7.htm']"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# # 重组链接\n",
    "# list_URL  = [urllib.parse.urlunparse\\\n",
    "# ([nfu_urlparse.scheme,nfu_urlparse.netloc,'/'+ nfu_urlparse.path.split('/')[1] +'/' + detail_url,'','',''])\\\n",
    "# for detail_url in parsed.xpath('//div[@class=\"news_title\"]/a/@href')]\n",
    "# list_URL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>标题</th>\n",
       "      <th>链结</th>\n",
       "      <th>日期</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>大学生暑期实践调查：四成大学生不满实习工作</td>\n",
       "      <td>{https://www.nfu.edu.cn/gjdt/309be8b078444044b...</td>\n",
       "      <td>2016-05-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>部分地区大学生就业呈“倒金字塔” 学历越高越慌</td>\n",
       "      <td>{https://www.nfu.edu.cn/gjdt/159b20971f8b4051b...</td>\n",
       "      <td>2016-05-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>高校迎来史上超长寒假</td>\n",
       "      <td>{https://www.nfu.edu.cn/gjdt/27ba495edc1b49f88...</td>\n",
       "      <td>2016-05-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>教育部：考研专业成绩优异或可“破格”复试</td>\n",
       "      <td>{https://www.nfu.edu.cn/gjdt/20dc120c250642cca...</td>\n",
       "      <td>2016-05-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>建设大学群为创新驱动提供强大支撑</td>\n",
       "      <td>{https://www.nfu.edu.cn/gjdt/b43531427fb44695b...</td>\n",
       "      <td>2016-05-04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>大学生就业新措施将带来哪些利好？ 专家解读促进高校毕业生就业创业新政策</td>\n",
       "      <td>{https://www.nfu.edu.cn/gjdt/1509f4f3bc2f4babb...</td>\n",
       "      <td>2016-05-04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>广东2016年招生工作规定出台 三类人群可加分</td>\n",
       "      <td>{https://www.nfu.edu.cn/gjdt/a4b2fb3dacae45649...</td>\n",
       "      <td>2016-05-03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>中国将支持中西部省份建设一批高水平大学</td>\n",
       "      <td>{https://www.nfu.edu.cn/gjdt/0e7d664c116f4a0ab...</td>\n",
       "      <td>2016-05-03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>上海大力打造高校“高峰”“高原”学科</td>\n",
       "      <td>{https://www.nfu.edu.cn/gjdt/71e152ddce1241438...</td>\n",
       "      <td>2016-05-03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>我国出国留学与归国就业人数之差逐年缩小</td>\n",
       "      <td>{https://www.nfu.edu.cn/gjdt/0d7bfc95f70841a6b...</td>\n",
       "      <td>2016-04-27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>盘点2015高等教育十大关键词</td>\n",
       "      <td>{https://www.nfu.edu.cn/gjdt/750c396e278446e68...</td>\n",
       "      <td>2016-04-27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>“与创业大学生一起成长很过瘾”</td>\n",
       "      <td>{https://www.nfu.edu.cn/gjdt/cd3ecf8986ad40e99...</td>\n",
       "      <td>2016-04-20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>中国内地中外合作办学机构项目逾两千 在校生55万人</td>\n",
       "      <td>{https://www.nfu.edu.cn/gjdt/3a7835bded2441aeb...</td>\n",
       "      <td>2016-04-18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>大学生一日三餐点外卖，不是简单事</td>\n",
       "      <td>{https://www.nfu.edu.cn/gjdt/4d0eb3a8b8ee47f6b...</td>\n",
       "      <td>2016-04-18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>教育部：高校招生未经批准不得预留机动计划</td>\n",
       "      <td>{https://www.nfu.edu.cn/gjdt/355e3d6207974a3ea...</td>\n",
       "      <td>2016-04-14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>广东拟调整公办普通高校学费标准 月底将召开听证会</td>\n",
       "      <td>{https://www.nfu.edu.cn/gjdt/9b6486d83f454a2ca...</td>\n",
       "      <td>2016-04-14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>50多所高校开招农村生 部分名校取消“到校复试”</td>\n",
       "      <td>{https://www.nfu.edu.cn/gjdt/6bb7172f46b3458b8...</td>\n",
       "      <td>2016-04-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>我国基础学科发展现：原创不强 模仿较普遍</td>\n",
       "      <td>{https://www.nfu.edu.cn/gjdt/a8f6f2c6a2c644d29...</td>\n",
       "      <td>2016-04-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>培养创新人才能成就创新经济</td>\n",
       "      <td>{https://www.nfu.edu.cn/gjdt/f978598b61024bb29...</td>\n",
       "      <td>2016-04-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>中国16年间新建本科院校400余所</td>\n",
       "      <td>{https://www.nfu.edu.cn/gjdt/3d9e9dfac15945938...</td>\n",
       "      <td>2016-04-08</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                     标题  \\\n",
       "0                 大学生暑期实践调查：四成大学生不满实习工作   \n",
       "1               部分地区大学生就业呈“倒金字塔” 学历越高越慌   \n",
       "2                            高校迎来史上超长寒假   \n",
       "3                  教育部：考研专业成绩优异或可“破格”复试   \n",
       "4                      建设大学群为创新驱动提供强大支撑   \n",
       "5   大学生就业新措施将带来哪些利好？ 专家解读促进高校毕业生就业创业新政策   \n",
       "6               广东2016年招生工作规定出台 三类人群可加分   \n",
       "7                   中国将支持中西部省份建设一批高水平大学   \n",
       "8                    上海大力打造高校“高峰”“高原”学科   \n",
       "9                   我国出国留学与归国就业人数之差逐年缩小   \n",
       "10                      盘点2015高等教育十大关键词   \n",
       "11                      “与创业大学生一起成长很过瘾”   \n",
       "12            中国内地中外合作办学机构项目逾两千 在校生55万人   \n",
       "13                     大学生一日三餐点外卖，不是简单事   \n",
       "14                 教育部：高校招生未经批准不得预留机动计划   \n",
       "15             广东拟调整公办普通高校学费标准 月底将召开听证会   \n",
       "16             50多所高校开招农村生 部分名校取消“到校复试”   \n",
       "17                 我国基础学科发展现：原创不强 模仿较普遍   \n",
       "18                        培养创新人才能成就创新经济   \n",
       "19                    中国16年间新建本科院校400余所   \n",
       "\n",
       "                                                   链结          日期  \n",
       "0   {https://www.nfu.edu.cn/gjdt/309be8b078444044b...  2016-05-11  \n",
       "1   {https://www.nfu.edu.cn/gjdt/159b20971f8b4051b...  2016-05-11  \n",
       "2   {https://www.nfu.edu.cn/gjdt/27ba495edc1b49f88...  2016-05-10  \n",
       "3   {https://www.nfu.edu.cn/gjdt/20dc120c250642cca...  2016-05-08  \n",
       "4   {https://www.nfu.edu.cn/gjdt/b43531427fb44695b...  2016-05-04  \n",
       "5   {https://www.nfu.edu.cn/gjdt/1509f4f3bc2f4babb...  2016-05-04  \n",
       "6   {https://www.nfu.edu.cn/gjdt/a4b2fb3dacae45649...  2016-05-03  \n",
       "7   {https://www.nfu.edu.cn/gjdt/0e7d664c116f4a0ab...  2016-05-03  \n",
       "8   {https://www.nfu.edu.cn/gjdt/71e152ddce1241438...  2016-05-03  \n",
       "9   {https://www.nfu.edu.cn/gjdt/0d7bfc95f70841a6b...  2016-04-27  \n",
       "10  {https://www.nfu.edu.cn/gjdt/750c396e278446e68...  2016-04-27  \n",
       "11  {https://www.nfu.edu.cn/gjdt/cd3ecf8986ad40e99...  2016-04-20  \n",
       "12  {https://www.nfu.edu.cn/gjdt/3a7835bded2441aeb...  2016-04-18  \n",
       "13  {https://www.nfu.edu.cn/gjdt/4d0eb3a8b8ee47f6b...  2016-04-18  \n",
       "14  {https://www.nfu.edu.cn/gjdt/355e3d6207974a3ea...  2016-04-14  \n",
       "15  {https://www.nfu.edu.cn/gjdt/9b6486d83f454a2ca...  2016-04-14  \n",
       "16  {https://www.nfu.edu.cn/gjdt/6bb7172f46b3458b8...  2016-04-11  \n",
       "17  {https://www.nfu.edu.cn/gjdt/a8f6f2c6a2c644d29...  2016-04-11  \n",
       "18  {https://www.nfu.edu.cn/gjdt/f978598b61024bb29...  2016-04-11  \n",
       "19  {https://www.nfu.edu.cn/gjdt/3d9e9dfac15945938...  2016-04-08  "
      ]
     },
     "execution_count": 99,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 输出结果\n",
    "# B-D-1 pd.DataFrame 建构，pandas课有教\n",
    "df = pd.DataFrame( {\n",
    "         \"标题\": parsed.xpath('//div[@class=\"news_title\"]/a/@title'),\n",
    "         \"链结\": list_URL,\n",
    "         \"日期\": parsed.xpath('//font[@class=\"right-more\"]/text()'),\n",
    "     } )\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "# B-D-2 pd.DataFrame 输出excel，pandas课有教\n",
    "df.to_excel(\"data_out/nfu_搞笑动态.xlsx\", sheet_name=\"检索结果\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'https://www.nfu.edu.cn/gjdt/index.htm'"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 第一页\n",
    "base_url_01 = r.url\n",
    "base_url_01"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SplitResult(scheme='https', netloc='www.nfu.edu.cn', path='/gjdt/index.htm', query='', fragment='')"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "urllib.parse.urlsplit(base_url_01)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>第一页</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>https</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>www.nfu.edu.cn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>/gjdt/index.htm</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               第一页\n",
       "0            https\n",
       "1   www.nfu.edu.cn\n",
       "2  /gjdt/index.htm\n",
       "3                 \n",
       "4                 "
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(urllib.parse.urlsplit(base_url_01)).rename({0:\"第一页\"},axis=1)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'https://www.nfu.edu.cn/gjdt/index2.htm'"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "base_url_02 = session.get('https://www.nfu.edu.cn/gjdt/index2.htm').url\n",
    "base_url_02"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>第一页</th>\n",
       "      <th>第二页</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>https</td>\n",
       "      <td>https</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>www.nfu.edu.cn</td>\n",
       "      <td>www.nfu.edu.cn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>/gjdt/index.htm</td>\n",
       "      <td>/gjdt/index2.htm</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               第一页               第二页\n",
       "0            https             https\n",
       "1   www.nfu.edu.cn    www.nfu.edu.cn\n",
       "2  /gjdt/index.htm  /gjdt/index2.htm\n",
       "3                                   \n",
       "4                                   "
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['第二页'] = urllib.parse.urlsplit(base_url_02)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "26\n"
     ]
    }
   ],
   "source": [
    "for i in range(1,100):\n",
    "    r = session.get('https://www.nfu.edu.cn/gjdt/index'+str(i)+'.htm')\n",
    "    if r.status_code != 200:\n",
    "        print(i)\n",
    "        break\n",
    "# so page = 19?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.nfu.edu.cn/gjdt/index1.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index2.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index3.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index4.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index5.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index6.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index7.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index8.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index9.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index10.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index11.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index12.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index13.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index14.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index15.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index16.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index17.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index18.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index19.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index20.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index21.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index22.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index23.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index24.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index25.htm']"
      ]
     },
     "execution_count": 109,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "url_group = ['https://www.nfu.edu.cn/gjdt/index'+str(i)+'.htm' for i in range(1,26)]\n",
    "url_group"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [],
   "source": [
    "url_group.insert(0,'https://www.nfu.edu.cn/gjdt/index.htm')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.nfu.edu.cn/gjdt/index.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index1.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index2.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index3.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index4.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index5.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index6.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index7.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index8.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index9.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index10.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index11.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index12.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index13.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index14.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index15.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index16.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index17.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index18.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index19.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index20.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index21.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index22.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index23.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index24.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index25.htm']"
      ]
     },
     "execution_count": 102,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "url_group"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/gjdt/index.htm'"
      ]
     },
     "execution_count": 103,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "urllib.parse.urlparse(url_group[0]).path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [],
   "source": [
    "for url in url_group:\n",
    "    r = session.get(url)\n",
    "#     print(r.html.html)\n",
    "    path = urllib.parse.urlparse(url).path\n",
    "    with open ('html_out/'+path, encoding = \"utf8\", mode = \"w\") as fp:\n",
    "        fp.write(r.html.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "# xpath 准备：\n",
    "dict_xpath = {\n",
    "    '链接_xpath':'//div[@class=\"news_title\"]/a/@href',\n",
    "    '标题_xpath':'//div[@class=\"news_title\"]/a/@title',\n",
    "    '日期_xpath':'//font[@class=\"right-more\"]/text()'\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [],
   "source": [
    "def pages_content_url(parsed):\n",
    "    list_URL  = [urllib.parse.urlunparse\\\n",
    "                 ([nfu_urlparse.scheme,nfu_urlparse.netloc,'/'+ nfu_urlparse.path.split('/')[1] +'/' + detail_url,'','',''])\\\n",
    "                 for detail_url in parsed.xpath(dict_xpath['链接_xpath'])]\n",
    "    return list_URL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['index.htm', 'index1.htm', 'index10.htm', 'index11.htm', 'index12.htm', 'index13.htm', 'index14.htm', 'index15.htm', 'index16.htm', 'index17.htm', 'index18.htm', 'index19.htm', 'index2.htm', 'index20.htm', 'index21.htm', 'index22.htm', 'index23.htm', 'index24.htm', 'index25.htm', 'index3.htm', 'index4.htm', 'index5.htm', 'index6.htm', 'index7.htm', 'index8.htm', 'index9.htm']\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>标题</th>\n",
       "      <th>链结</th>\n",
       "      <th>日期</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>教育部党组《求是》撰文：精心谋划 切实抓好教育系统党史学习教育</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/309be8b078444044b5...</td>\n",
       "      <td>2021-04-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>教育部长陈宝生：把巩固拓展作为开局之年工作主题，做到6个到位</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/159b20971f8b4051ba...</td>\n",
       "      <td>2021-03-20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>如何建设高质量教育体系？“十四五”规划和2035年远景目标纲要明确了</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/27ba495edc1b49f88b...</td>\n",
       "      <td>2021-03-15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>教育部长陈宝生《旗帜》撰文：建设高质量教育体系，加快建成教育强国</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/20dc120c250642cca5...</td>\n",
       "      <td>2021-01-05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>重磅！《推进粤港澳大湾区高等教育合作发展规划》正式印发</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/b43531427fb44695bb...</td>\n",
       "      <td>2020-12-22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>368</th>\n",
       "      <td>8</td>\n",
       "      <td>广东省教育厅：今年毕业生就业形势比去年好</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/3829e4c5df9e460abc...</td>\n",
       "      <td>2014-03-28</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>369</th>\n",
       "      <td>9</td>\n",
       "      <td>要求职业“高大上” 高校毕业生择业扎堆致就业难</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/776ebc41fae84b36a4...</td>\n",
       "      <td>2014-03-27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>370</th>\n",
       "      <td>10</td>\n",
       "      <td>教育部：预计今年贫困地区农村学生上重点高校的人数将比去年增加10%以上</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/41d339ccb3a0464c9c...</td>\n",
       "      <td>2014-03-25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>371</th>\n",
       "      <td>11</td>\n",
       "      <td>学位论文如何才能挤出“水分”</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/1e8fa309bcf847b6ad...</td>\n",
       "      <td>2014-03-24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>372</th>\n",
       "      <td>12</td>\n",
       "      <td>高校低年级学生频繁试水招聘会 专家：鼓励提前预热</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/3f34245a7cb449c99b...</td>\n",
       "      <td>2013-03-31</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>513 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     index                                   标题  \\\n",
       "0        0      教育部党组《求是》撰文：精心谋划 切实抓好教育系统党史学习教育   \n",
       "1        1       教育部长陈宝生：把巩固拓展作为开局之年工作主题，做到6个到位   \n",
       "2        2   如何建设高质量教育体系？“十四五”规划和2035年远景目标纲要明确了   \n",
       "3        3     教育部长陈宝生《旗帜》撰文：建设高质量教育体系，加快建成教育强国   \n",
       "4        4          重磅！《推进粤港澳大湾区高等教育合作发展规划》正式印发   \n",
       "..     ...                                  ...   \n",
       "368      8                 广东省教育厅：今年毕业生就业形势比去年好   \n",
       "369      9              要求职业“高大上” 高校毕业生择业扎堆致就业难   \n",
       "370     10  教育部：预计今年贫困地区农村学生上重点高校的人数将比去年增加10%以上   \n",
       "371     11                       学位论文如何才能挤出“水分”   \n",
       "372     12             高校低年级学生频繁试水招聘会 专家：鼓励提前预热   \n",
       "\n",
       "                                                    链结          日期  \n",
       "0    https://www.nfu.edu.cn/gjdt/309be8b078444044b5...  2021-04-08  \n",
       "1    https://www.nfu.edu.cn/gjdt/159b20971f8b4051ba...  2021-03-20  \n",
       "2    https://www.nfu.edu.cn/gjdt/27ba495edc1b49f88b...  2021-03-15  \n",
       "3    https://www.nfu.edu.cn/gjdt/20dc120c250642cca5...  2021-01-05  \n",
       "4    https://www.nfu.edu.cn/gjdt/b43531427fb44695bb...  2020-12-22  \n",
       "..                                                 ...         ...  \n",
       "368  https://www.nfu.edu.cn/gjdt/3829e4c5df9e460abc...  2014-03-28  \n",
       "369  https://www.nfu.edu.cn/gjdt/776ebc41fae84b36a4...  2014-03-27  \n",
       "370  https://www.nfu.edu.cn/gjdt/41d339ccb3a0464c9c...  2014-03-25  \n",
       "371  https://www.nfu.edu.cn/gjdt/1e8fa309bcf847b6ad...  2014-03-24  \n",
       "372  https://www.nfu.edu.cn/gjdt/3f34245a7cb449c99b...  2013-03-31  \n",
       "\n",
       "[513 rows x 4 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "list_df = []\n",
    "\n",
    "\n",
    "files= os.listdir('html_out/gjdt/')\n",
    "print(files)\n",
    "\n",
    "for html in files:\n",
    "    with open('html_out/gjdt/'+html,encoding='utf8',mode='r') as fp:\n",
    "        html_load = fp.read()\n",
    "        parsed = requests_html.soup_parse(html_load)\n",
    "        list_URL = pages_content_url(parsed)\n",
    "        \n",
    "        df = pd.DataFrame( {\n",
    "         \"标题\": parsed.xpath(dict_xpath['标题_xpath']),\n",
    "         \"链结\": list_URL,\n",
    "         \"日期\": parsed.xpath(dict_xpath['日期_xpath']),\n",
    "        } )\n",
    "        list_df.append(df)\n",
    "\n",
    "        \n",
    "        \n",
    "df_all = pd.concat(list_df).reset_index().sort_values(by='日期',ascending=False)\n",
    "display(df_all)    \n",
    "\n",
    "with pd.ExcelWriter('data_out/高教动态.xlsx',mode='w',engine=\"openpyxl\") as writer:  \n",
    "            df_all.to_excel(writer, sheet_name='媒体报道')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
